/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.searcher; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; import net.nutch.analysis.NutchDocumentAnalyzer; import net.nutch.analysis.CommonGrams; import net.nutch.searcher.Query.*; import java.io.IOException; /** Translation from Nutch queries to Lucene queries. */ public class QueryTranslator { private QueryTranslator() {} // can't construct private static float URL_BOOST = 4.0f; private static float ANCHOR_BOOST = 2.0f; private static int SLOP = Integer.MAX_VALUE; private static float PHRASE_BOOST = 1.0f; /** Set the boost factor for url matches, relative to content and anchor * matches */ public static void setUrlBoost(float boost) { URL_BOOST = boost; } /** Set the boost factor for title/anchor matches, relative to url and * content matches. */ public static void setAnchorBoost(float boost) { ANCHOR_BOOST = boost; } /** Set the boost factor for sloppy phrase matches relative to unordered term * matches. */ public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; } /** Set the maximum number of terms permitted between matching terms in a * sloppy phrase match. */ public static void setSlop(int slop) { SLOP = slop; } private static Interface DEFAULT_IMPL = new Default(); /** Set the default query translation implementation. */ public static void setDefaultTranslator(Interface impl) { DEFAULT_IMPL = impl; } /** Get the default query translation implementation. */ public static Interface getDefaultTranslator() { return DEFAULT_IMPL; } /** Translate a Nutch query into a Lucene query using the default * translator. */ public static org.apache.lucene.search.Query translate(Query input) { return DEFAULT_IMPL.translate(input); } /** Query translation interface. */ public interface Interface { /** Translate a Nutch query into a Lucene query. */ org.apache.lucene.search.Query translate(Query input); } /** The default query translator. Searches all fields with both sloppy * phrases and individual terms.*/ public static class Default implements Interface { public org.apache.lucene.search.Query translate(Query input) { BooleanQuery output = new BooleanQuery(); addClauses(input, "url", output, URL_BOOST, SLOP); addClauses(input, "anchor", output, ANCHOR_BOOST, NutchDocumentAnalyzer.INTER_ANCHOR_GAP); addClauses(input, "content", output, 1.0f, SLOP); //System.out.println(output.toString("content")); return output; } /** Add all terms from a Nutch query to a Lucene query, searching the named * field as a sloppy phrase and as individual terms.. */ private static void addClauses(Query input, String field, BooleanQuery output, float boost, int slop) { BooleanQuery requirements = new BooleanQuery(); PhraseQuery sloppyPhrase = new PhraseQuery(); sloppyPhrase.setSlop(slop); sloppyPhrase.setBoost(boost * PHRASE_BOOST); int sloppyTerms = 0; Clause[] clauses = input.getClauses(); for (int i = 0; i < clauses.length; i++) { Clause clause = clauses[i]; if (clause.isPhrase()) { // optimize phrase clauses String[] opt = CommonGrams.optimizePhrase(clause.getPhrase(), field); if (opt.length==1) { clause = new Clause(new Term(opt[0]), clause.isRequired(), clause.isProhibited()); } else { clause = new Clause(new Phrase(opt), clause.isRequired(), clause.isProhibited()); } } if (clause.isRequired()) { if (!clause.isPhrase()) { sloppyPhrase.add(luceneTerm(field, clause.getTerm())); sloppyTerms++; requirements.add(termQuery(field, clause.getTerm(), boost), true, false); } else { requirements.add(exactPhrase(clause.getPhrase(), field, boost), true, false); } } else if (clause.isProhibited()) { if (!clause.isPhrase()) { output.add(new TermQuery(luceneTerm(field, clause.getTerm())), false, true); } else { output.add(exactPhrase(clause.getPhrase(), field, boost), false, true); } } } if (sloppyTerms > 1) { requirements.add(sloppyPhrase, true, false); } output.add(requirements, false, false); } private static TermQuery termQuery(String field, Term term, float boost) { TermQuery result = new TermQuery(luceneTerm(field, term)); result.setBoost(boost); return result; } /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */ private static PhraseQuery exactPhrase(Phrase nutchPhrase, String field, float boost) { Term[] terms = nutchPhrase.getTerms(); PhraseQuery exactPhrase = new PhraseQuery(); for (int i = 0; i < terms.length; i++) { exactPhrase.add(luceneTerm(field, terms[i])); } exactPhrase.setBoost(boost); return exactPhrase; } /** Utility to construct a Lucene Term given a Nutch query term and field. */ private static org.apache.lucene.index.Term luceneTerm(String field, Term term) { return new org.apache.lucene.index.Term(field, term.toString()); } }}